In this notebook we will cover an approach to do an exploratory data analysis with the data of the Brazilian general elections of 2018.

1. Obtaining the data

We will fetch the data from the oficial governemnt repository available at the webpage of the TSE (Tribunal Superior Eleitoral.)

1.1 Importing libraries

require(rvest)     # web scrapping
require(plotly)    # data visualization
require(ggplot2)   # data visualization
require(lubridate) # format dates 
require(tidyverse) # data cleansing

1.2 Set working directory

a) Variables

wd       <- '~/dev/r' # '/cloud/project'
project  <- 'brz_elections'
raw_data <- 'raw_data'

b) First level path

dest_path <- file.path(wd, project)

if (!dir.exists(dest_path)) {
  dir.create(dest_path)   # create path case it does not exist
  print(paste("Folder", dest_path , "created at", wd))
} else {
  print(paste("Folder", dest_path , "already exists"))
}
[1] "Folder ~/dev/r/brz_elections already exists"

c) Second level path

dest_path <- file.path(dest_path, raw_data)

if (!dir.exists(dest_path)) {
  dir.create(dest_path) # create directory
  print(paste("Folder ", dest_path , "created sucessfully "))
} else {
  print(paste("Folder ", dest_path , "already exists"))
}
[1] "Folder  ~/dev/r/brz_elections/raw_data already exists"

d) Set working directory

setwd(dest_path)
The working directory was changed to /home/gaston/dev/r/brz_elections inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.

1.3 Dowload source files

a) Variables

# main page with the source data
target_url <- "http://www.tse.jus.br/hotsites/pesquisas-eleitorais/prestacao_contas_anos/2018.html"

# get html from source URL above
html       <-  read_html(target_url)

# use css selector to just filter the path with the other url adresses
nodes      <-  html_nodes(html, 'div p a')

# final fetch of the url addresses with the raw data
zip_url    <-  html_attr(nodes, 'href')

b) Download files

# now need to do a for loop in this case
# r handles the download of the whole vector
# *** WARNING ***
# some files are huge - >= 200 MB
download.file(url = zip_url, 
              destfile = file.path(dest_path, basename(zip_url)),
              mode = 'wb')
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/prestacao_de_contas_eleitorais_orgaos_partidarios_2018.zip'
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/prestacao_de_contas_eleitorais_candidatos_2018.zip'
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/CNPJ_campanha_2018.zip'

c) Unziping


for (f in file.path(dest_path, basename(zip_url))) {
  print(f)
  unzip(f, exdir = dest_path)
}
[1] "~/dev/r/brz_elections/raw_data/prestacao_de_contas_eleitorais_orgaos_partidarios_2018.zip"
error 1 in extracting from zip file
[1] "~/dev/r/brz_elections/raw_data/prestacao_de_contas_eleitorais_candidatos_2018.zip"
[1] "~/dev/r/brz_elections/raw_data/CNPJ_campanha_2018.zip"
error 1 in extracting from zip file

d) Delete some files

# delete redundant files that will not be used
pattern         <- '*BRASIL*'
files_to_remove <- grep(dir(dest_path), pattern = pattern, inv=T, value = T)
file.remove(file.path(dest_path,files_to_remove))
cannot remove file '~/dev/r/brz_elections/raw_data/a', reason 'Directory not empty'
 [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[20]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

2. Loading and cleaning the data

2.1 Load and clean downloaded csvs

a) Revenues

b) Expenses paid

c) Expenses hired

despesa_c <-  read.csv(file.path(dest_path, 'despesas_contratadas_candidatos_2018_BRASIL.csv'),
                    sep = ';', 
                    dec = ',',
                    encoding = 'latin1')

rm_columns <- c('NR_CPF_VICE_CANDIDATO','SG_UF','CD_ELEICAO','DS_ELEICAO',
                'CD_TIPO_ELEICAO','NM_TIPO_ELEICAO','ANO_ELEICAO','DT_GERACAO',
                'HH_GERACAO','TP_PRESTACAO_CONTAS','SQ_PRESTADOR_CONTAS','NR_CPF_CANDIDATO',
                'NM_PARTIDO','NR_PARTIDO','DS_CNAE_FORNECEDOR','CD_TIPO_FORNECEDOR','CD_CNAE_FORNECEDOR',
                'DS_ESFERA_PART_FORNECEDOR','CD_ESFERA_PART_FORNECEDOR','SG_UF_FORNECEDOR',
                'CD_MUNICIPIO_FORNECEDOR','NM_MUNICIPIO_FORNECEDOR','SQ_CANDIDATO_FORNECEDOR',
                'NR_CANDIDATO_FORNECEDOR','CD_CARGO_FORNECEDOR','DS_ORIGEM_DESPESA',
                'CD_ORIGEM_DESPESA','SQ_PARCELAMENTO_DESPESA')

despesa_c[, rm_columns] <-NULL

d) Backup clean frames

saveRDS(receita,   file.path(dest_path, 'clean_revenue.rds'))
saveRDS(despesa,   file.path(dest_path, 'clean_expenses_paid.rds'))
saveRDS(despesa_c, file.path(dest_path, 'clean_expenses_hired.rds'))

2.2 Download candidate’s assets data

a) Download

b) Clean

c) Backup

---
title: "*Brazilian 2018 Elections EDA*"
# author: "Gaston Guillaux"
# date: "2020-01-26"
output: html_notebook
---

In this notebook we will cover an approach to do an exploratory data analysis with the data of the Brazilian general elections of 2018.

## **1. Obtaining the data**
We will fetch the data from the oficial governemnt repository available at the webpage of the TSE (Tribunal Superior Eleitoral.)

* **Main URL** - http://www.tse.jus.br/eleicoes/estatisticas/repositorio-de-dados-eleitorais-1/repositorio-de-dados-eleitorais
* **Main frame from above URL** - http://www.tse.jus.br/hotsites/pesquisas-eleitorais/index.html
* **Tab with the accountability data** - http://www.tse.jus.br/hotsites/pesquisas-eleitorais/prestacao_contas.html
* **Final link with the 2018 data** - http://www.tse.jus.br/hotsites/pesquisas-eleitorais/prestacao_contas_anos/2018.html

### 1.1 Importing libraries
```{r}
require(rvest)     # web scrapping
require(plotly)    # data visualization
require(ggplot2)   # data visualization
require(lubridate) # format dates 
require(tidyverse) # data cleansing
```

### 1.2 Set working directory {.tabset}
#### **a) Variables**
```{r}
wd       <- '~/dev/r' # '/cloud/project'
project  <- 'brz_elections'
raw_data <- 'raw_data'
```

#### **b) First level path**
```{r}
dest_path <- file.path(wd, project)

if (!dir.exists(dest_path)) {
  dir.create(dest_path)   # create path case it does not exist
  print(paste("Folder", dest_path , "created at", wd))
} else {
  print(paste("Folder", dest_path , "already exists"))
}
```

#### **c) Second level path**
```{r}
dest_path <- file.path(dest_path, raw_data)

if (!dir.exists(dest_path)) {
  dir.create(dest_path) # create directory
  print(paste("Folder ", dest_path , "created sucessfully "))
} else {
  print(paste("Folder ", dest_path , "already exists"))
}
```

#### **d) Set working directory**
```{r}
setwd(dest_path)
```

### 1.3 Dowload source files {.tabset}
#### **a) Variables**
```{r}
# main page with the source data
target_url <- "http://www.tse.jus.br/hotsites/pesquisas-eleitorais/prestacao_contas_anos/2018.html"

# get html from source URL above
html       <-  read_html(target_url)

# use css selector to just filter the path with the other url adresses
nodes      <-  html_nodes(html, 'div p a')

# final fetch of the url addresses with the raw data
zip_url    <-  html_attr(nodes, 'href')
```

#### **b) Download files**
```{r}
# now need to do a for loop in this case
# r handles the download of the whole vector
# *** WARNING ***
# some files are huge - >= 200 MB
download.file(url = zip_url, 
              destfile = file.path(dest_path, basename(zip_url)),
              mode = 'wb')
```

#### **c) Unziping**
```{r}

for (f in file.path(dest_path, basename(zip_url))) {
  print(f)
  unzip(f, exdir = dest_path)
}

```

#### **d) Delete some files**
```{r}
# delete redundant files that will not be used
pattern         <- '*BRASIL*'
files_to_remove <- grep(dir(dest_path), pattern = pattern, inv=T, value = T)
file.remove(file.path(dest_path,files_to_remove))
```


## **2. Loading and cleaning the data**

### 2.1 Load and clean downloaded csvs {.tabset}
#### **a) Revenues**
```{r}
# read revenues csv
receita <-  read.csv(file.path(dest_path, 'receitas_candidatos_2018_BRASIL.csv'),
                    sep = ';', 
                    dec = ',',
                    encoding = 'latin1')

# we have 57 columns! 
# to keep things simple let's remove the ones we know are not useful right now
receita$SG_UF                       <- NULL
receita$NR_CPF_VICE_CANDIDATO       <- NULL
receita$CD_ELEICAO                  <- NULL
receita$DS_ELEICAO                  <- NULL
receita$CD_TIPO_ELEICAO             <- NULL
receita$NM_TIPO_ELEICAO             <- NULL
receita$ANO_ELEICAO                 <- NULL
receita$DT_GERACAO                  <- NULL
receita$HH_GERACAO                  <- NULL
receita$TP_PRESTACAO_CONTAS         <- NULL
receita$DS_CARGO_CANDIDATO_DOADOR   <- NULL
receita$CD_ESFERA_PARTIDARIA_DOADOR <- NULL
receita$DS_ESFERA_PARTIDARIA_DOADOR <- NULL
receita$NM_DOADOR                   <- NULL
receita$CD_MUNICIPIO_DOADOR         <- NULL
receita$NM_MUNICIPIO_DOADOR         <- NULL        
receita$SQ_CANDIDATO_DOADOR         <- NULL
receita$NR_CANDIDATO_DOADOR         <- NULL
receita$NR_DOCUMENTO_DOACAO         <- NULL
receita$SQ_PRESTADOR_CONTAS         <- NULL
receita$SQ_RECEITA                  <- NULL
receita$CD_CARGO_CANDIDATO_DOADOR   <- NULL
receita$NR_CPF_CANDIDATO            <- NULL
receita$NR_CPF_CNPJ_DOADOR          <- NULL
receita$NM_PARTIDO                  <- NULL
receita$NR_PARTIDO_DOADOR           <- NULL
receita$NM_PARTIDO_DOADOR           <- NULL
receita$NR_PARTIDO                  <- NULL
receita$CD_FONTE_RECEITA            <- NULL
receita$CD_ORIGEM_RECEITA           <- NULL
receita$CD_NATUREZA_RECEITA         <- NULL
receita$CD_ESPECIE_RECEITA          <- NULL
receita$CD_CNAE_DOADOR              <- NULL
receita$SG_PARTIDO_DOADOR           <- NULL

# format dates using lubridate
receita$DT_ELEICAO            <- dmy(receita$DT_ELEICAO)
receita$DT_PRESTACAO_CONTAS   <- dmy(receita$DT_PRESTACAO_CONTAS)
receita$DT_RECEITA            <- dmy(receita$DT_RECEITA)

# display head
head(receita)
```

#### **b) Expenses paid**
```{r}
# another way to do the cleansing in r, now using expenses data
despesa <-  read.csv(file.path(dest_path, 'despesas_pagas_candidatos_2018_BRASIL.csv'),
                    sep = ';', 
                    dec = ',',
                    encoding = 'latin1')

rm_columns <- c('NR_CPF_VICE_CANDIDATO','SG_UF','CD_ELEICAO','DS_ELEICAO',
                'CD_TIPO_ELEICAO','NM_TIPO_ELEICAO','ANO_ELEICAO','DT_GERACAO',
                'HH_GERACAO','TP_PRESTACAO_CONTAS','SQ_PRESTADOR_CONTAS','NR_CPF_CANDIDATO',
                'NM_PARTIDO','NR_PARTIDO','DS_CNAE_FORNECEDOR','CD_TIPO_FORNECEDOR','CD_CNAE_FORNECEDOR',
                'DS_ESFERA_PART_FORNECEDOR','CD_ESFERA_PART_FORNECEDOR','SG_UF_FORNECEDOR',
                'CD_MUNICIPIO_FORNECEDOR','NM_MUNICIPIO_FORNECEDOR','SQ_CANDIDATO_FORNECEDOR',
                'NR_CANDIDATO_FORNECEDOR','CD_CARGO_FORNECEDOR','DS_ORIGEM_DESPESA',
                'CD_ORIGEM_DESPESA','SQ_PARCELAMENTO_DESPESA')

despesa[, rm_columns] <-NULL
head(despesa)
```

#### **c) Expenses hired**
```{r}
despesa_c <-  read.csv(file.path(dest_path, 'despesas_contratadas_candidatos_2018_BRASIL.csv'),
                    sep = ';', 
                    dec = ',',
                    encoding = 'latin1')

rm_columns <- c('NR_CPF_VICE_CANDIDATO','SG_UF','CD_ELEICAO','DS_ELEICAO',
                'CD_TIPO_ELEICAO','NM_TIPO_ELEICAO','ANO_ELEICAO','DT_GERACAO',
                'HH_GERACAO','TP_PRESTACAO_CONTAS','SQ_PRESTADOR_CONTAS','NR_CPF_CANDIDATO',
                'NM_PARTIDO','NR_PARTIDO','DS_CNAE_FORNECEDOR','CD_TIPO_FORNECEDOR','CD_CNAE_FORNECEDOR',
                'DS_ESFERA_PART_FORNECEDOR','CD_ESFERA_PART_FORNECEDOR','SG_UF_FORNECEDOR',
                'CD_MUNICIPIO_FORNECEDOR','NM_MUNICIPIO_FORNECEDOR','SQ_CANDIDATO_FORNECEDOR',
                'NR_CANDIDATO_FORNECEDOR','CD_CARGO_FORNECEDOR','DS_ORIGEM_DESPESA',
                'CD_ORIGEM_DESPESA','SQ_PARCELAMENTO_DESPESA')


despesa_c[, rm_columns] <-NULL
head(despesa_c)
```

#### **d) Backup clean frames**
```{r}
saveRDS(receita,   file.path(dest_path, 'clean_revenue.rds'))
saveRDS(despesa,   file.path(dest_path, 'clean_expenses_paid.rds'))
saveRDS(despesa_c, file.path(dest_path, 'clean_expenses_hired.rds'))
```


### 2.2 Download candidate's assets data {.tabset}
#### **a) Download**
```{r}
```

#### **b) Clean**
```{r}
```

#### **c) Backup**
```{r}

```